import os
import glob
import pandas as pd
cities = ['Shenzhen', 'Shanghai', 'Guangzhou', 'Chengdu']
dates = ['20201218', '20210205', '20210416']
# Every (city, date) pair; one raw listing file is read per pair.
t = tuple((c, p) for c in cities for p in dates)

# Text that follows the number on a price / points line (newline included).
_PRICE_SUFFIX = ' CNY /每晚 起\n'
_POINTS_SUFFIX = ' 积分/住宿\n'
# Line content that anchors one hotel entry in the raw text.
_ANCHOR = '距目的地'
# Candidate positions of the price line relative to the anchor, in the order
# they are tried. The points line is always the line right before the price.
_PRICE_OFFSETS = (3, 2, 4)


def _parse_amount(line, suffix):
    """Return the integer left in *line* after removing *suffix* and commas.

    Raises:
        ValueError: if what remains is not an integer (for example a price
            quoted in a currency other than CNY, whose suffix is not removed).
    """
    return int(line.replace(suffix, '').replace(',', ''))


def _parse_hotels(lines):
    """Extract ``[hotel_name, price_in_cny, points]`` rows from raw lines.

    Blank lines are dropped first, so every offset below counts non-blank
    lines only. For each line containing the anchor text, the hotel name is
    taken from four lines above it, and the price/points pair is looked up at
    each offset in ``_PRICE_OFFSETS`` until both values parse. Entries for
    which no offset yields both values are skipped.

    Args:
        lines: the lines of one listing file, newline characters included.

    Returns:
        A list of ``[hotel_name, price_in_cny, points]`` lists.
    """
    lines = [line for line in lines if line.strip(' ') != '\n']
    rows = []
    for i, line in enumerate(lines):
        # Surprisingly, the "distance to destination" line is the reliable anchor.
        if _ANCHOR not in line:
            continue
        if i < 4:
            # No name line exists four lines above; a negative index would
            # silently wrap around to the end of the file.
            continue
        hotel_name = lines[i - 4].strip()
        for price_offset in _PRICE_OFFSETS:
            try:
                # Parse BOTH values before recording anything, so a
                # half-successful attempt cannot leave a malformed row behind.
                price_in_cny = _parse_amount(lines[i + price_offset], _PRICE_SUFFIX)
                points = _parse_amount(lines[i + price_offset - 1], _POINTS_SUFFIX)
            except (ValueError, IndexError):
                # Not a number at this offset, or past the end of the file:
                # try the next candidate layout.
                continue
            rows.append([hotel_name, price_in_cny, points])
            break
    return rows


os.makedirs('CSVResult', exist_ok=True)
for city, date in t:
    src = city + '-' + date + '.txt'
    with open('PriceData/' + src, encoding='utf-8') as f:
        data = _parse_hotels(f.readlines())
    frame = pd.DataFrame(data, columns=('hotel_name', 'price_in_cny', 'points'))
    # NOTE(review): 12.220643722 is a scaling constant carried over unchanged;
    # its derivation is not visible in this file -- confirm before changing.
    frame['ppr'] = frame['price_in_cny'] / frame['points'] * 12.220643722
    frame.to_csv('CSVResult/' + city + '-' + date + '.csv')
# Additional cleaning step: for each city, combine the per-date CSV files into
# one table with a ``ppr_<date>`` column per date. The merge is an inner join
# on ``hotel_name`` (the pandas default), so hotels missing from any date are
# dropped.
for city in cities:
    frame = None
    for date in dates:
        part = pd.read_csv('CSVResult/' + city + '-' + date + '.csv',
                           usecols=['hotel_name', 'ppr'])
        # Give the ratio column its final per-date name up front, so the merge
        # never has to disambiguate overlapping column names via suffixes and
        # the code works for any number of dates.
        part = part.rename(columns={'ppr': 'ppr_' + date})
        # The first date is the starting table; later dates are joined onto it.
        frame = part if frame is None else frame.merge(part, on='hotel_name')
    frame.to_csv('CSVResult/{}.csv'.format(city))
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties
# Custom font (Kaiti SC), loaded from a file under the user's home directory.
# expanduser() is used instead of os.getenv('HOME') so that an unset HOME does
# not turn into a TypeError on string concatenation.
kai = FontProperties(fname=os.path.expanduser(
    '~/Library/Mobile Documents/com~apple~CloudDocs/Syncing/KaitiSC-Regular.ttf'))
# Use a custom style sheet.
mpl.style.use('~/Library/Mobile Documents/com~apple~CloudDocs/Syncing/kvt-academic.mplstyle')
mpl.rcParams['font.size'] = 8

cities = ['Shenzhen', 'Shanghai', 'Guangzhou', 'Chengdu']
# Chinese display names, in the same order as ``cities``.
cities_chs = ['深圳', '上海', '广州', '成都']
dates = ['20201218', '20210205', '20210416']
# One scatter colour per date, in the same order as ``dates``.
colors = ['lightsteelblue', 'lightgreen', 'lightcoral']
# Bonus ratios: "buy 1, get `promotion` extra". A horizontal reference line is
# drawn at 1 / (1 + promotion) for each entry; use [0] for the baseline only.
promotions = [0, 0.5]
def vertical_label(string):
    """Return *string* with spaces removed and one character per line.

    Used to render hotel names vertically as x tick labels.
    """
    return '\n'.join(s for s in string if s != ' ')


# Draw one scatter chart per city: the price/points ratio of every hotel, one
# colour per date, saved to an image named after the city and then shown.
for city, city_chs in zip(cities, cities_chs):
    frame = pd.read_csv('CSVResult/' + city + '.csv', index_col=0)
    positions = np.arange(len(frame))

    # A fresh figure per city: reusing plt.gca() would draw onto the previous
    # city's axes whenever plt.show() does not destroy the figure (for example
    # on a non-interactive backend).
    fig, ax = plt.subplots()

    title = '{}各酒店价格积分比\n{}(蓝色){}(绿色){}(红色)'.format(city_chs, *dates)
    kai.set_size(12)
    ax.set_title(title, fontproperties=kai)

    for date, color in zip(dates, colors):
        ax.scatter(positions, frame['ppr_' + date], label=date, color=color, alpha=0.5)

    # Hotel names as vertical tick labels, in the smaller font size.
    ax.set_xticks(positions)
    kai.set_size(8)
    ax.set_xticklabels(frame['hotel_name'].map(vertical_label), fontproperties=kai)

    # Dotted guide line at every tick position to make hotels easy to locate.
    ax.vlines(positions, 0, 5, color='lightgrey', linestyles='dotted')

    # Dashed horizontal reference lines at y = 1 / (1 + promotion).
    ax.set_xlim(-1, len(frame))
    ax.hlines([1 / (1 + p) for p in promotions], -1, len(frame), color='k', linestyles='dashed')

    # Fine-grained y ticks; the visible range is the data range padded by 0.1.
    ax.set_yticks(np.arange(0, 5, 0.05))
    ratios = frame[['ppr_' + date for date in dates]].values
    ax.set_ylim(ratios.min() - 0.1, ratios.max() + 0.1)

    # Save and show, then release the figure so figures do not accumulate.
    fig.savefig(city)
    plt.show()
    plt.close(fig)